# Loading Packages
import pandas as pd
import nltk
import numpy as np
# Fetch the lexicon VADER needs before constructing the analyzer below.
# 'punkt' is downloaded but not obviously used in this file -- presumably
# kept for tokenization elsewhere (TODO confirm).
nltk.download('vader_lexicon')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#Importing Comments
df1 = pd.read_csv('Data/Comments1.csv')
df2 = pd.read_csv('Data/Comments2.csv')
df3 = pd.read_csv('Data/Comments3.csv')
df4 = pd.read_csv('Data/Comments4.csv')
df5 = pd.read_csv('Data/Comments5.csv')
df6 = pd.read_csv('Data/Comments6.csv')
#Joining
comments = pd.concat([df1,df2,df3,df4,df5,df6])
comments.columns = ['Article_Title','Article_Author','Article_Publish_Date','Comment_Title','Comment_Body',
'Comment_Poster','Comment_Date','Comment_Time','Comment_Recs']
# Unique article titles; one sentiment average is produced per title below.
names = comments['Article_Title'].unique()

# Initialize the VADER sentiment tool.
sid = SentimentIntensityAnalyzer()
# The sid analyzer provides weighted scores for negative, neutral, and positive
# words, then gives a compound score in the range -1 to 1 for overall sentiment.
# Sanity check: score a single comment body (row 9, column 4 = Comment_Body)
# before looping over everything.
test = comments.iloc[9,4]
scores = sid.polarity_scores(test)
# Bare expressions below are notebook-style cell output; they are no-ops when
# this runs as a plain script.
test
scores
# Average compound sentiment per article.
# For each unique article title: gather its comments, score each comment body
# with VADER, and record the mean compound score, a date for the article, and
# the number of comments.  The three lists are kept index-aligned with `names`
# so they can be combined into a DataFrame afterwards.
scores = []
date = []
ncomm = []
n_articles = len(names)
for i in range(n_articles):
    # Progress indicator every 1000 articles.  (The original test
    # `i / 1000 == i // 1000` was an obfuscated modulo, and its
    # `i == len(names)` clause can never be true inside range().)
    if i % 1000 == 0:
        print(f'{i} of {n_articles}')
    # Filter comments to the appropriate article, save the number of comments.
    temp = comments[comments.Article_Title == names[i]]
    ncomm.append(temp.shape[0])
    if temp.shape[0] != 0:
        temp_scores = []
        for j in range(temp.shape[0]):
            # Grab the comment body (column 4 = Comment_Body).
            body = temp.iloc[j, 4]
            # Missing bodies load as NaN (a float); treat them as empty text.
            if isinstance(body, float):
                body = ""
            # Remove leftover line breaks and literal '\p' artifacts from the
            # HTML scrape before scoring.
            body = body.replace('\r', '').replace('\n', '').replace('\p', '')
            # Keep only VADER's overall compound score for this comment.
            temp_scores.append(sid.polarity_scores(body)['compound'])
        # Save a date for the article: prefer the publish date (column 2),
        # falling back to column 5 when it is NaN.  NOTE(review): column 5 is
        # Comment_Poster under the header names above -- confirm the intended
        # fallback column against the raw CSVs.
        if not isinstance(temp.iloc[0, 2], float):
            date.append(temp.iloc[0, 2])
        else:
            date.append(temp.iloc[0, 5])
        scores.append(np.mean(temp_scores))
    else:
        # Keep scores/date aligned with names/ncomm even if an article has no
        # rows (the original skipped these appends, which would desynchronize
        # the lists combined into the DataFrame below).
        date.append(np.nan)
        scores.append(np.nan)
# Assemble the per-article results.  Building from a dict (rather than
# pd.DataFrame([...]).T) gives each column its natural dtype -- in particular
# Av_Score becomes float64, which .rolling().mean() further below requires,
# instead of the object dtype the transposed-list construction produces.
average_sent = pd.DataFrame({
    'Article': names,
    'Av_Score': scores,
    'Date': date,
    'Number_of_Comments': ncomm,
})
# Parse dates and sort chronologically for the time-series plots.
average_sent['Date'] = pd.to_datetime(average_sent.Date)
average_sent = average_sent.sort_values(by='Date')
#Plot it not in garbo mathplotlib
import plotly.graph_objects as go
import plotly.io as pio
# 20-article rolling mean to smooth the noisy per-article scores.
rolling_mean = average_sent.Av_Score.rolling(window=20).mean()
# Bare expression: notebook cell output listing available renderers;
# a no-op when run as a plain script.
pio.renderers
fig = go.Figure()
# Raw per-article series in blue, smoothed series in red.
fig.add_trace(go.Scatter(x=average_sent.Date,y=average_sent.Av_Score,name='Average Sentiment by Article',
line=dict(color='blue')))
fig.add_trace(go.Scatter(x=average_sent.Date,y=rolling_mean,name='Rolling Average [20]',
line=dict(color='red')))
fig.update_layout(legend_orientation="h")
# Two show() calls: the default renderer, then an explicit SVG render --
# presumably leftover renderer experiments from the notebook (TODO: keep one).
fig.show()
fig.show(renderer="svg")
# Trying to get the plots to show by saving/loading them as an html
import plotly
import IPython
from IPython.display import HTML, display
# Write the interactive figure to disk without popping a browser tab.
plotly.offline.plot(fig, filename='Plots/Av_Sent_by_Art.html', auto_open=False)
# Embed the saved HTML.  `display` is imported explicitly so this also works
# outside IPython's interactive namespace, where the original bare
# `display(...)` call would raise NameError; the two earlier bare HTML(...)
# expressions were redundant no-ops and are dropped.
display(HTML(filename='Plots/Av_Sent_by_Art.html'))
# Some of the most positive and negative articles:
# NOTE: this reassigns average_sent to the post-2008 subset -- everything
# below (including the >10-comments analysis) inherits that date filter.
average_sent = average_sent[average_sent.Date > '2008-01-01'].sort_values(by='Av_Score')
# Ascending sort: the first 10 rows are the most NEGATIVE articles.
# (Bare .head(10) calls are notebook cell output; no-ops in a plain script.)
average_sent.head(10)
average_sent = average_sent.sort_values(by='Av_Score',ascending=False)
# Descending sort: the first 10 rows are the most POSITIVE articles.
average_sent.head(10)
#Redone with only articles that have more than 10 comments
# (this also inherits the post-2008 filter applied to average_sent above).
average_sent_adj = average_sent[average_sent.Number_of_Comments > 10]
average_sent_adj = average_sent_adj.sort_values(by='Date')
# Recompute the 20-article rolling mean on the filtered subset.
rolling_mean = average_sent_adj.Av_Score.rolling(window=20).mean()
fig = go.Figure()
fig.add_trace(go.Scatter(x=average_sent_adj.Date,y=average_sent_adj.Av_Score,name='Average Sentiment by Article (Adjusted)',
line=dict(color='blue')))
fig.add_trace(go.Scatter(x=average_sent_adj.Date,y=rolling_mean,name='Rolling Average [20]',
line=dict(color='red')))
fig.update_layout(legend_orientation="h")
fig.show()
# Save the adjusted plot; the bare HTML(...) expression is notebook cell
# output and a no-op when run as a plain script.
plotly.offline.plot(fig, filename='Plots/Av_Sent_by_Art_Adj.html',auto_open=False)
IPython.display.HTML(filename='Plots/Av_Sent_by_Art_Adj.html')
# Most negative (ascending) then most positive (descending) of the adjusted
# subset; again, .head(10) lines only produce output in a notebook.
average_sent_adj = average_sent_adj.sort_values(by='Av_Score')
average_sent_adj.head(10)
average_sent_adj = average_sent_adj.sort_values(by='Av_Score',ascending=False)
average_sent_adj.head(10)